This is the data that Victor M generated for the predicted and novel miRNA. I’ve analyzed miRNA expression by
+ Neural Tube Defects’ (NTDs) status
+ Sex
+ Trimester
+ Ancestry (TBD)
Figures and plots to be made
sum_stats <- pDat[c(6,9,10,28)]
tab1 <- tableby(trimester~., data = sum_stats, test =FALSE) #package: arsenal --> cannot use with %>%
summary (tab1, title ="Breakdown of Samples by Trimester")| 1 (N=5) | 2 (N=16) | 3 (N=9) | Total (N=30) | |
|---|---|---|---|---|
| condition | ||||
| con | 5 (100.0%) | 10 (62.5%) | 9 (100.0%) | 24 (80.0%) |
| NTD | 0 (0.0%) | 6 (37.5%) | 0 (0.0%) | 6 (20.0%) |
| sex | ||||
| FEMALE | 3 (60.0%) | 10 (62.5%) | 4 (44.4%) | 17 (56.7%) |
| MALE | 2 (40.0%) | 6 (37.5%) | 5 (55.6%) | 13 (43.3%) |
| 450k.array | ||||
| Match | 0 (0.0%) | 15 (93.8%) | 9 (100.0%) | 24 (80.0%) |
| No Match | 5 (100.0%) | 1 (6.2%) | 0 (0.0%) | 6 (20.0%) |
library(devtools)
install_github("vqv/ggbiplot")
library(ggbiplot)
library(limma)
#str(t_edat)
t_edat <-t_edat %>%
mutate_all(funs(as.character)) %>%
mutate_all(funs(as.numeric))
rn2 <- colnames(eDat)
rn2 <- rn2[c(-1)]
row.names(t_edat) <- rn2
paged_table(t_edat)## [1] TRUE
desmat_cond <- model.matrix(~condition, pDat) #linear modelling on condition --> controls vs NTD
con_mat <- lmFit(t_edat, desmat_cond)
con_mat_eb <- eBayes(con_mat)
diff_condition_genes <- topTable(con_mat_eb, n = Inf)
#Controlling for Sex
desmat_cond_sex <- model.matrix(~condition + sex, pDat) #conditionmodel controlling for sex
con_sex_mat <- lmFit(t_edat, desmat_cond_sex)
con_sex_mat_eb <- eBayes(con_sex_mat)
con_sex_mat_genes <- topTable(con_sex_mat_eb, n = Inf)
#Controlling for GA
desmat_cond_ga <- model.matrix(~condition + GA, pDat) #condition model controlling for GA
con_ga_mat <- lmFit(t_edat, desmat_cond_ga)
con_ga_mat_eb <- eBayes(con_ga_mat)
con_ga_mat_genes <- topTable(con_ga_mat_eb, n = Inf, adjust.method = "fdr", sort.by="B", resort.by = "logFC")(Univariate Analysis)
Considering only Trimester 2 Samples:
tri2p <- pDat %>% #trimester 2 phenotype
filter (trimester == "2")
tri2 <- as.character(tri2p$sample) #select sample names of trimester 2 --> make value
#tri2g <- t_edat %>% #this dropped the trimester 2 samples and kept the trimester 1 and 3 --> we want the opposite
# modify_at(tri2,~NULL)
tri2g <- t_edat %>%
select(which(names(t_edat) %in% tri2))
(tri2p$sample == names(tri2g)) %>% all()## [1] TRUE
tri2_mat <- model.matrix(~condition + sex, tri2p) #desmat for trimester 2 samples modelling for condition controllinf for sex
tri2_mat_sex <- lmFit(tri2g, tri2_mat)
tri2_mat_eb <- eBayes(tri2_mat_sex)## Warning in fitFDist(var, df1 = df, covariate = covariate): More than half of
## residual variances are exactly zero: eBayes unreliable
## Removing intercept from test coefficients
Now here, we need to specify that we want to consider condition as our variable or coefficient of interest. Otherwise, the function relies on the default intercept which would give us different results:
diff_tri_sex <- topTable(tri2_mat_eb, n = Inf, adjust.method = "fdr", coef = 2, p.value = 0.05) #specifying variable of interest as control/NTD from tri2_mat column 2
paged_table(diff_tri_sex)(Multivariate Analysis):
tri23p <- pDat %>%
filter (!trimester == "1")
tri23 <- as.character(tri23p$sample)
tri23g <- t_edat %>%
select(which(names(t_edat) %in% tri23))
(tri23p$sample == names(tri23g)) %>% all()## [1] TRUE
tri23_mat <- model.matrix(~sex + trimester, tri23p) #desmat for trimester 2 and 3 samples modelling for sex controlling by trimester --> GA doesn't work since matrix larger than no of samples, and no residual degress of freedom
tri23genes_sex <- lmFit(tri23g, tri23_mat) %>%
eBayes()## Warning in fitFDist(var, df1 = df, covariate = covariate): More than half of
## residual variances are exactly zero: eBayes unreliable
tri23_list <- topTable(tri23genes_sex, n = Inf, adjust.method = "fdr", coef = 2, p.value = 0.05) #differentially expressed by sex in trimester 2 and 3 in controls and NTDs
paged_table(tri23_list)tri23_list_tri3 <- topTable(tri23genes_sex, n = Inf, adjust.method = "fdr", coef = 3, p.value = 0.05) #differentially expressed in trimester 3 in controls and NTDs
nrow(tri23_list_tri3)## [1] 80
Making contrasts manually as opposed to implicit determinimation using lmFit:
However, since there is only variable we’re considering above (sex), we don’t really need to use a contrast matrix. (Specifying coef = 2 in tri23genes_sex isn’t going to make a difference since there is only one variable and we aren’t controlling for confoudnders or covariates)
## Removing intercept from test coefficients
Now, if we actually applied the same coef fucntion to consider sex as the variable of interest in the Controls vs NTDs in Trimester 2 samples analysis above, we get the same significantly differentially expressed miRNA by sex as when looking at Sex-Differential Genes in Trimester 2 and 3:
diff_tri_sex <- topTable(tri2_mat_eb, n = Inf, adjust.method = "fdr", coef = 3, p.value = 0.05)
paged_table(diff_tri_sex)controls_p <- pDat %>%
filter (condition == "con")
control_samp <- as.character(controls_p$sample)
control_g <- t_edat %>%
select(which(names(t_edat) %in% control_samp))
(controls_p$sample == names(control_g)) %>% all()## [1] TRUE
controls_mat <- model.matrix(~trimester + sex, controls_p) #desmat for all controls modelling for trimester controlling by sex
controls_genes <- lmFit(control_g, controls_mat) %>%
eBayes()## Warning in fitFDist(var, df1 = df, covariate = covariate): More than half of
## residual variances are exactly zero: eBayes unreliable
controls_tri2_list <- topTable(controls_genes, n = Inf, adjust.method = "fdr", coef = 2, p.value = 0.05) #genes differentially expressed in controls at trimester 2
nrow(controls_tri2_list)## [1] 97
controls_tri3_list <- topTable(controls_genes, n = Inf, adjust.method = "fdr", coef = 3, p.value = 0.05) #genes differentially expressed in controls at trimester 3
nrow(controls_tri3_list)## [1] 153
controls_sex_list <- topTable(controls_genes, n = Inf, adjust.method = "fdr", coef = 4, p.value = 0.05) #genes differentially expressed in controls by sex --> none?
controls_sex_list## data frame with 0 columns and 0 rows